library(bigMap)
Run pt-SNE on Sierpinski-3d with perplexity 2030 (99% of the data set size) and a decreasing range of thread-ratio. We show:
# source aux. stuff (graph plot function)
source('../graphs.R')
# load edge matrix
E <- as.matrix(read.csv('../sierpinski3d.edg', sep = '', header = F))
D <- as.matrix(read.csv('../sierpinski3d.mtx', sep = '', header = F))
g <- bdm.init(D, dSet.name = 'S3D', is.distance = T, ppx = 2030, threads = 4)
g1 <- bdm.ptsne(D, g, threads = 1, layers = 1)
g2 <- bdm.ptsne(D, g, threads = 3, layers = 2)
g3 <- bdm.ptsne(D, g, threads = 4, layers = 2)
g4 <- bdm.ptsne(D, g, threads = 5, layers = 2)
g5 <- bdm.ptsne(D, g, threads = 6, layers = 2)
g6 <- bdm.ptsne(D, g, threads = 8, layers = 2)
g.list <- list(g1, g2, g3, g4, g5, g6)
save(g.list, file = './s3d_2030.RData')
Note that the global structure is more or less preserved (we are using a very high perplexity) but the local structure gets worst as the thread-ratio is decreased.
nulL <- lapply(g.list, function(g) graph.plot(g, E))
In terms of HL-correlation the loss in structure (on average) is minimal.
g.list <- lapply(g.list, function(g) bdm.hlCorr(D, g, threads = 4))
save(g.list, file = './s3d_2030.RData')
hlTable <- sapply(g.list, function(g) summary(g$hlC))
hlTable <- t(round(hlTable, 4))
threadRatio <- sapply(g.list, function(g) g$ptsne$layers /g$ptsne$threads)
rownames(hlTable) <- round(threadRatio, 2)
knitr::kable(hlTable, caption = 'hl-Correlation by thread-ratio') %>%
kable_styling(full_width = F)
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. | |
|---|---|---|---|---|---|---|
| 1 | 0.7383 | 0.7493 | 0.7547 | 0.7522 | 0.7576 | 0.7609 |
| 0.67 | 0.7403 | 0.7459 | 0.7477 | 0.7496 | 0.7515 | 0.7626 |
| 0.5 | 0.7463 | 0.7464 | 0.7509 | 0.7513 | 0.7559 | 0.7574 |
| 0.4 | 0.7373 | 0.7444 | 0.7502 | 0.7497 | 0.7555 | 0.7612 |
| 0.33 | 0.7549 | 0.7569 | 0.7579 | 0.7590 | 0.7599 | 0.7654 |
| 0.25 | 0.7402 | 0.7435 | 0.7463 | 0.7464 | 0.7491 | 0.7527 |
In terms of kNP, the global structure (linAUC) is notably preserved (with the exception of g5 with thread-ratio = 0.33, blue line) but the loss in local structre (logAUC) is clearly perceptible.
g.list <- lapply(g.list, function(g) bdm.knp(D, g, threads = 4))
save(g.list, file = './s3d_2030.RData')
bdm.knp.plot(g.list, ppxfrmt = 0)
Note the correlation between decreasing thread-ratio and decreasing running time.
rTimes <- sapply(g.list, function(g) c(g$ppx$t[3], g$t$epoch, g$t$ptsne[3], sum(c(g$ppx$t[3], g$t$ptsne[3]))))
rTimes <- round(rTimes, 2)
threadRatio <- sapply(g.list, function(g) g$ptsne$layers /g$ptsne$threads)
colnames(rTimes) <- round(threadRatio, 2)
rownames(rTimes) <- c('betas', 'epoch', 'ptSNE', 'total')
knitr::kable(rTimes, caption = 'Computation times (s) by thread-ratio') %>%
kable_styling(full_width = F)
| 1 | 0.67 | 0.5 | 0.4 | 0.33 | 0.25 | |
|---|---|---|---|---|---|---|
| betas | 0.38 | 0.38 | 0.38 | 0.38 | 0.38 | 0.38 |
| epoch | 1.89 | 0.86 | 0.54 | 0.48 | 0.40 | 0.29 |
| ptSNE | 62.85 | 27.84 | 17.01 | 15.12 | 12.41 | 9.02 |
| total | 63.22 | 28.21 | 17.39 | 15.50 | 12.79 | 9.39 |
Run on: Intel(R) Xeon(R) CPU E31225 @ 3.10GHz, 4 cores, 16GB RAM.
Let’s take g6 (perplexity 2030 and lowest thread-ratio=0.25) and apply EFC with different compression perplexities.
# take the embedding with lowest thread-ratio
g.2030c <- g.list[6:6]
# compress with ppx = 1845 (0.90)
g.2030c <- bdm.efc(D, g.2030c, ppx = 1845, iters = 50, threads = 4)
# compress with ppx = 1025 (0.50)
g.2030c <- bdm.efc(D, g.2030c, ppx = 1025, iters = 50, threads = 4)
# compress with ppx = 205 (0.10)
g.2030c <- bdm.efc(D, g.2030c, ppx = 205, iters = 50, threads = 4)
# compress with ppx = 102 (0.05)
g.2030c <- bdm.efc(D, g.2030c, ppx = 102, iters = 50, threads = 4)
# compress with ppx = 20 (0.01)
g.2030c <- bdm.efc(D, g.2030c, ppx = 20, iters = 50, threads = 4)
save(g.2030c, file = './s3d_2030c.RData')
# load embeddings with thread-ratio = 1.0 (from Fig.1)
load('../pt-SNE/glist.RData')
g.threadRatio1 <- g.list[c(13, 11, 7, 3, 2, 1)]
g <- g.threadRatio1[[1]]
graph.plot(g, E, title = 'ppx=2030, thread-ratio=1.0')
g <- g.2030c[[1]]
graph.plot(g, E, title = 'ppx=2030, thread-ratio=0.25')
Note that decreasing EFC perplexities lead to the same scale resolutions we got using these same perplexities and thread-reatio = 1.0 (outputs shown in Fig.1), but the global structure we already captured is preserved.
nulL <- lapply(2:6, function(i) {
g <- g.threadRatio1[[i]]
graph.plot(g, E, title = paste('ppx=', g$ppx$ppx, ', thread-ratio=1.0', sep = ''))
g <- g.2030c[[i]]
graph.plot(g, E, title = paste('ppx=2030, thread-ratio=0.25, +efc.', g$ppx$ppx, sep = ''))
})
The HL-correlation shows a decrease as we enhanced local structure while preserving global structure because short/long distances are not equally preserved.
g.2030c <- lapply(g.2030c, function(g) bdm.hlCorr(D, g, zSampleSize = 1000, threads = 4))
hlTable <- sapply(g.2030c, function(g) summary(g$hlC))
hlTable <- t(round(hlTable, 4))
efc.ppx <- paste('efc.', sapply(g.2030c, function(g) g$ppx$ppx), sep = '')
rownames(hlTable) <- efc.ppx
knitr::kable(hlTable, caption = 'hl-Correlation by EFC-perplexity') %>%
kable_styling(full_width = F)
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. | |
|---|---|---|---|---|---|---|
| efc.2030 | 0.7513 | 0.7527 | 0.7544 | 0.7545 | 0.7561 | 0.7578 |
| efc.1845 | 0.7342 | 0.7374 | 0.7418 | 0.7430 | 0.7474 | 0.7541 |
| efc.1025 | 0.7215 | 0.7226 | 0.7264 | 0.7280 | 0.7318 | 0.7375 |
| efc.205 | 0.7214 | 0.7273 | 0.7313 | 0.7301 | 0.7341 | 0.7364 |
| efc.102 | 0.7162 | 0.7181 | 0.7199 | 0.7217 | 0.7235 | 0.7310 |
| efc.20 | 0.7115 | 0.7159 | 0.7249 | 0.7244 | 0.7334 | 0.7366 |
In terms of kNP we clearly observe that while preserving the global structure (linAUC), EFC results in a clear increase of local structure (logAUC).
g.2030c <- lapply(g.2030c, function(g) bdm.knp(D, g, k.max = NULL, sampling = 0.9, threads = 4))
bdm.knp.plot(g.2030c, ppxfrmt = 0)
The running time cost of EFC is obviously high, but a few iterations are usually enough to show some imprevement in the definition of the local structure.
rTimes <- sapply(g.2030c, function(g) c(g$ppx$t[3], g$t$epoch, g$t$ptsne[3], g$t$efc[3], sum(c(g$ppx$t[3], g$t$ptsne[3], g$t$efc[3]))))
rTimes <- round(rTimes, 2)
colnames(rTimes) <- sapply(g.2030c, function(g) g$ppx$ppx)
rownames(rTimes) <- c('betas', 'epoch', 'ptSNE', 'EFC', 'total')
knitr::kable(rTimes, caption = 'Computation times (s) by EFC-perplexity') %>%
kable_styling(full_width = F)
| 2030 | 1845 | 1025 | 205 | 102 | 20 | |
|---|---|---|---|---|---|---|
| betas | 0.38 | 0.45 | 0.56 | 0.34 | 0.14 | 0.11 |
| epoch | 0.29 | 0.29 | 0.29 | 0.29 | 0.29 | 0.29 |
| ptSNE | 9.02 | 9.02 | 9.02 | 9.02 | 9.02 | 9.02 |
| EFC | 0.00 | 19.76 | 19.23 | 18.83 | 19.15 | 19.19 |
| total | 9.39 | 29.23 | 28.81 | 28.19 | 28.31 | 28.32 |
Run on: Intel(R) Xeon(R) CPU E31225 @ 3.10GHz, 4 cores, 16GB RAM.